In [1]:
# Star import: this notebook uses `DATA_PATH` on the next lines without
# defining it, so it must come from `setup`. Other names used below without an
# explicit import (`pd`, `os`, `display`, `HTML`, `url`) presumably come from
# this or the `constants` star import -- TODO confirm and import explicitly.
from setup import *
import sys
# Put the data directory on the import path (guarded so re-running the cell
# does not append it twice) -- presumably so `constants` can be found there.
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Inject CSS that stretches the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Keep DataFrame displays short (12 rows) but wide (up to 200 columns).
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 200)


/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/matplotlib/__init__.py:1350: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)

In [2]:
# Load the tweet dump from the gzipped CSV in the data directory.
# low_memory=False asks pandas to parse the file in one go instead of in
# internal chunks, which avoids mixed-dtype column inference.
tweets_csv_path = os.path.join(DATA_PATH, 'deduped_tweets.csv.gz')
df = pd.read_csv(tweets_csv_path, low_memory=False)

In [3]:
# Keep only the most recent row for each tweet id and the three columns we
# need for the sanity check below.
df = df.drop_duplicates(subset='id', keep='last')[['id', 'id_str', 'text']]
# Sanity check: count rows where the numeric id disagrees with `id_str`.
# A result of 0 means `id_str` is redundant and can be dropped.
# (The bare `df.id == df.id_str` expression that used to sit here was removed:
# only the last expression of a cell is displayed, so its result was discarded.)
(df.id != df.id_str).sum()


Out[3]:
0

In [4]:
# `id_str` matched `id` on every row, so carry only `id` and `text` forward.
df = df.loc[:, ['id', 'text']]

In [5]:
# Eyeball the raw tweet text before cleaning it.
df['text']


Out[5]:
0         #python never stop learning what you enjoy doi...
1         Watching Boa vs. Python — https://t.co/Pivpk02s2A
2         Monty Python - The silly walk https://t.co/C0J...
3         Senior Software Engineer Full Stack Python Dja...
4         Architect Django Solr Platform Engineer With P...
5                     peaceful rain? Python - inevitability
                                ...                        
183064    Las 3 mejores ides para Python Antes de empeza...
183065    Gagal tidur gegara habis vertical limit ada fi...
183066    Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...
183067    RT @RealPython: List of Python API Wrappers &g...
183068    Watching Boa vs. Python — https://t.co/5THbrirfQO
183069    Чертова дюжина вакансий в IT и Digital /  / 1....
Name: text, dtype: object

In [6]:
# First attempt at tokenizing: split each tweet on runs of whitespace.
# Punctuation, hashtags, mentions and URLs all stay attached to the tokens.
df['tokens'] = df['text'].str.split()
df


Out[6]:
id text tokens
0 724276510626979840 #python never stop learning what you enjoy doi... [#python, never, stop, learning, what, you, en...
1 724276498249572352 Watching Boa vs. Python — https://t.co/Pivpk02s2A [Watching, Boa, vs., Python, —, https://t.co/P...
2 724276388325412866 Monty Python - The silly walk https://t.co/C0J... [Monty, Python, -, The, silly, walk, https://t...
3 725078887005347840 Senior Software Engineer Full Stack Python Dja... [Senior, Software, Engineer, Full, Stack, Pyth...
4 725078874338541572 Architect Django Solr Platform Engineer With P... [Architect, Django, Solr, Platform, Engineer, ...
5 725078868802068481 peaceful rain? Python - inevitability [peaceful, rain?, Python, -, inevitability]
... ... ... ...
183064 724275847591546880 Las 3 mejores ides para Python Antes de empeza... [Las, 3, mejores, ides, para, Python, Antes, d...
183065 724275810777985026 Gagal tidur gegara habis vertical limit ada fi... [Gagal, tidur, gegara, habis, vertical, limit,...
183066 724275650043875328 Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/... [Go, boa, wkwk💪😄, ★, Boa, vs., Python, —, http...
183067 724275609858392066 RT @RealPython: List of Python API Wrappers &g... [RT, @RealPython:, List, of, Python, API, Wrap...
183068 724275578879111169 Watching Boa vs. Python — https://t.co/5THbrirfQO [Watching, Boa, vs., Python, —, https://t.co/5...
183069 724275568871673857 Чертова дюжина вакансий в IT и Digital / / 1.... [Чертова, дюжина, вакансий, в, IT, и, Digital,...

183070 rows × 3 columns


In [7]:
# Second attempt: remove URLs before splitting so links do not become tokens.
# `url` is not defined in this notebook -- presumably a URL regex provided by
# one of the star imports in the first cell; TODO confirm.
# regex=True is passed explicitly because `Series.str.replace` treats the
# pattern as a literal string by default since pandas 2.0 (and rejects
# compiled patterns in that mode), so without it the URLs are not stripped.
df['tokens'] = df.text.str.replace(url, '', regex=True).str.split()
df


Out[7]:
id text tokens
0 724276510626979840 #python never stop learning what you enjoy doi... [#python, never, stop, learning, what, you, en...
1 724276498249572352 Watching Boa vs. Python — https://t.co/Pivpk02s2A [Watching, Boa, vs., Python, —]
2 724276388325412866 Monty Python - The silly walk https://t.co/C0J... [Monty, Python, -, The, silly, walk, via, @You...
3 725078887005347840 Senior Software Engineer Full Stack Python Dja... [Senior, Software, Engineer, Full, Stack, Pyth...
4 725078874338541572 Architect Django Solr Platform Engineer With P... [Architect, Django, Solr, Platform, Engineer, ...
5 725078868802068481 peaceful rain? Python - inevitability [peaceful, rain?, Python, -, inevitability]
... ... ... ...
183064 724275847591546880 Las 3 mejores ides para Python Antes de empeza... [Las, 3, mejores, ides, para, Python, Antes, d...
183065 724275810777985026 Gagal tidur gegara habis vertical limit ada fi... [Gagal, tidur, gegara, habis, vertical, limit,...
183066 724275650043875328 Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/... [Go, boa, wkwk💪😄, ★, Boa, vs., Python, —]
183067 724275609858392066 RT @RealPython: List of Python API Wrappers &g... [RT, @RealPython:, List, of, Python, API, Wrap...
183068 724275578879111169 Watching Boa vs. Python — https://t.co/5THbrirfQO [Watching, Boa, vs., Python, —]
183069 724275568871673857 Чертова дюжина вакансий в IT и Digital / / 1.... [Чертова, дюжина, вакансий, в, IT, и, Digital,...

183070 rows × 3 columns


In [8]:
# Build a cleaned-up `txt` column in three passes:
#   1. replace URLs with a space (`url` presumably comes from the star
#      imports in the first cell -- TODO confirm),
#   2. replace every run of non-word characters with a space -- this removes
#      punctuation and emoji, but also the '#' of hashtags and the '@' of
#      mentions,
#   3. collapse runs of whitespace into a single space.
# regex=True is required on each call: since pandas 2.0 the default is a
# literal string match, which would leave the text essentially untouched.
df['txt'] = (
    df.text
    .str.replace(url, ' ', regex=True)
    .str.replace(r'\W+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)
df.txt


Out[8]:
0          python never stop learning what you enjoy doing 
1                                   Watching Boa vs Python 
2                   Monty Python The silly walk via YouTube
3         Senior Software Engineer Full Stack Python Dja...
4         Architect Django Solr Platform Engineer With P...
5                        peaceful rain Python inevitability
                                ...                        
183064    Las 3 mejores ides para Python Antes de empeza...
183065    Gagal tidur gegara habis vertical limit ada fi...
183066                           Go boa wkwk Boa vs Python 
183067    RT RealPython List of Python API Wrappers gt g...
183068                              Watching Boa vs Python 
183069    Чертова дюжина вакансий в IT и Digital 1 Go ра...
Name: txt, dtype: object

In [9]:
# Drop runs of digits from the cleaned text, then collapse the whitespace the
# substitution leaves behind. regex=True is explicit because pandas >= 2.0
# would otherwise search for the literal strings '\d+' and '\s+'.
df['txt'] = (
    df.txt
    .str.replace(r'\d+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)
# Re-tokenize from the cleaned text; this overwrites the earlier `tokens`.
df['tokens'] = df.txt.str.split()
df


Out[9]:
id text tokens txt
0 724276510626979840 #python never stop learning what you enjoy doi... [python, never, stop, learning, what, you, enj... python never stop learning what you enjoy doing
1 724276498249572352 Watching Boa vs. Python — https://t.co/Pivpk02s2A [Watching, Boa, vs, Python] Watching Boa vs Python
2 724276388325412866 Monty Python - The silly walk https://t.co/C0J... [Monty, Python, The, silly, walk, via, YouTube] Monty Python The silly walk via YouTube
3 725078887005347840 Senior Software Engineer Full Stack Python Dja... [Senior, Software, Engineer, Full, Stack, Pyth... Senior Software Engineer Full Stack Python Dja...
4 725078874338541572 Architect Django Solr Platform Engineer With P... [Architect, Django, Solr, Platform, Engineer, ... Architect Django Solr Platform Engineer With P...
5 725078868802068481 peaceful rain? Python - inevitability [peaceful, rain, Python, inevitability] peaceful rain Python inevitability
... ... ... ... ...
183064 724275847591546880 Las 3 mejores ides para Python Antes de empeza... [Las, mejores, ides, para, Python, Antes, de, ... Las mejores ides para Python Antes de empezar ...
183065 724275810777985026 Gagal tidur gegara habis vertical limit ada fi... [Gagal, tidur, gegara, habis, vertical, limit,... Gagal tidur gegara habis vertical limit ada fi...
183066 724275650043875328 Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/... [Go, boa, wkwk, Boa, vs, Python] Go boa wkwk Boa vs Python
183067 724275609858392066 RT @RealPython: List of Python API Wrappers &g... [RT, RealPython, List, of, Python, API, Wrappe... RT RealPython List of Python API Wrappers gt g...
183068 724275578879111169 Watching Boa vs. Python — https://t.co/5THbrirfQO [Watching, Boa, vs, Python] Watching Boa vs Python
183069 724275568871673857 Чертова дюжина вакансий в IT и Digital / / 1.... [Чертова, дюжина, вакансий, в, IT, и, Digital,... Чертова дюжина вакансий в IT и Digital Go разр...

183070 rows × 4 columns

Notice that we trounced the hashtag #python: replacing every run of non-word characters (`\W+`) with a space also stripped the `#`.
That's not good.
Can you fix it?
Is there anything else we might be messing up?
What other punctuation marks have special meaning in Tweets?


In [10]:
# Improve on the "stopword" filters here:
#
# :-) (ask me about a smiley lexicon)
# not-so-simple words? (ask me about a regex for compound words)
# Python variable names with underscores? (regex)

In [11]:
# Use the stdlib `csv` module directly for the quoting constant. The previous
# `pd.io.common.csv` only worked because a pandas-internal module happened to
# import `csv`; that is not public API. The import stays in this cell, as the
# notebook already does for `gzip`, so the cell works on its own.
import csv

# Persist the cleaned frame as a gzipped, UTF-8 encoded CSV.
# QUOTE_NONNUMERIC quotes every non-numeric field, so tweet text containing
# commas or newlines survives the round trip.
# Note: the list-valued `tokens` column is written as its string form, so it
# comes back as text rather than as lists.
f = os.path.join(DATA_PATH, 'text.csv.gz')
df.to_csv(f, encoding='utf8', compression='gzip', quoting=csv.QUOTE_NONNUMERIC)

In [12]:
import gzip

# Read the file back through an explicit gzip handle.
# index_col=0 restores the row index that `to_csv` wrote as the first
# (unnamed) column; without it that column comes back as a stray
# 'Unnamed: 0' data column and the frame no longer matches what was saved.
# The handle has its own name so it does not overwrite the path stored in `f`.
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as gz_file:
    df = pd.read_csv(gz_file, index_col=0)

Make sure you can read it back in!


In [13]:
# Reload the saved frame straight from the gzipped CSV (compression is
# inferred from the '.gz' extension).
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv(..., index_col=0)` is the replacement and restores the saved
# row index. from_csv's implicit date parsing of the index is deliberately
# left out, since the index here is plain integer row labels.
# Note: `tokens` comes back as the string form of each list, not as lists.
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0)
df


Out[13]:
id text tokens txt
0 724276510626979840 #python never stop learning what you enjoy doi... ['python', 'never', 'stop', 'learning', 'what'... python never stop learning what you enjoy doing
1 724276498249572352 Watching Boa vs. Python — https://t.co/Pivpk02s2A ['Watching', 'Boa', 'vs', 'Python'] Watching Boa vs Python
2 724276388325412866 Monty Python - The silly walk https://t.co/C0J... ['Monty', 'Python', 'The', 'silly', 'walk', 'v... Monty Python The silly walk via YouTube
3 725078887005347840 Senior Software Engineer Full Stack Python Dja... ['Senior', 'Software', 'Engineer', 'Full', 'St... Senior Software Engineer Full Stack Python Dja...
4 725078874338541572 Architect Django Solr Platform Engineer With P... ['Architect', 'Django', 'Solr', 'Platform', 'E... Architect Django Solr Platform Engineer With P...
5 725078868802068481 peaceful rain? Python - inevitability ['peaceful', 'rain', 'Python', 'inevitability'] peaceful rain Python inevitability
... ... ... ... ...
183064 724275847591546880 Las 3 mejores ides para Python Antes de empeza... ['Las', 'mejores', 'ides', 'para', 'Python', '... Las mejores ides para Python Antes de empezar ...
183065 724275810777985026 Gagal tidur gegara habis vertical limit ada fi... ['Gagal', 'tidur', 'gegara', 'habis', 'vertica... Gagal tidur gegara habis vertical limit ada fi...
183066 724275650043875328 Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/... ['Go', 'boa', 'wkwk', 'Boa', 'vs', 'Python'] Go boa wkwk Boa vs Python
183067 724275609858392066 RT @RealPython: List of Python API Wrappers &g... ['RT', 'RealPython', 'List', 'of', 'Python', '... RT RealPython List of Python API Wrappers gt g...
183068 724275578879111169 Watching Boa vs. Python — https://t.co/5THbrirfQO ['Watching', 'Boa', 'vs', 'Python'] Watching Boa vs Python
183069 724275568871673857 Чертова дюжина вакансий в IT и Digital / / 1.... ['Чертова', 'дюжина', 'вакансий', 'в', 'IT', '... Чертова дюжина вакансий в IT и Digital Go разр...

183070 rows × 4 columns


In [ ]: